#
# whoVotesInPrimaries.R
#
# Analyze California voter files to determine who
# votes in primary elections, and look for
# compositional changes after implementation of
# top-two format in 2012.
#
# Seth Hill and Thad Kousser, September, 2015.
#

# Start from an empty workspace so objects left over from an earlier session
# cannot leak into the analysis.
rm(list = ls())
if (.Platform$OS.type == "windows") {
  # On Windows, set the working directory to the location of this script; on
  # a unix machine this is not necessary. The path is read from the `ofile`
  # variable found in the active call frames, so it is only available when the
  # script is run through source(); otherwise the working directory is left
  # untouched instead of failing on an empty lookup.
  f_f <- Filter(Negate(is.null), lapply(sys.frames(), function(x) x$ofile))
  if (length(f_f) > 0) {
    setwd(dirname(f_f[[length(f_f)]]))
  }
  rm(f_f)
  # NOTE(review): .doit() is not defined in this file; any error from calling
  # it is deliberately silenced here -- confirm whether the hook is still needed.
  try(.doit(), silent = TRUE)
}
options(stringsAsFactors = FALSE)
library(data.table)
library(bit64)
library(biglm)

# Row limit handed to fread(); -1 reads everything. Swap in a smaller value
# (e.g. 100e4) for debugging.
nrows <- -1
# Number of separate chunks of data used for the biglm estimation.
num.chunks <- 100

#
# Call in data.
#
dt.stacked <- fread("whoVotesInPrimaries.csv", nrows = nrows)
# Recreate the age-decade factor with "50-59" as the first level.
set(
  dt.stacked,
  j = "age.dec.fac",
  value = factor(
    dt.stacked[["age.dec.fac"]],
    levels = c("50-59", "18-19", "20-29", "30-39", "40-49", "60-69", "70-79")
  )
)

#
# Table S1. Regression models, diff in diff.
#
cat("Running regression models...\n");flush.console()
library(apsrtable)
library(biglm)

# Each model is estimated incrementally: biglm() on the first chunk of rows,
# then update() with every remaining chunk. The chunking below needs at least
# one chunk and at least one row per chunk.
stopifnot(num.chunks >= 1, nrow(dt.stacked) >= num.chunks)
cat("Estimating biglm on",num.chunks,"chunks of size",floor(nrow(dt.stacked)/num.chunks),"cases.\n")
# Last row index of every chunk; the final chunk always ends at the last row.
# seq_len() keeps this correct when num.chunks is 1 (a single chunk).
chunk.cuts <- c(floor(seq_len(num.chunks-1)*(nrow(dt.stacked)/num.chunks)),nrow(dt.stacked))

fitInChunks <- function(model.formula, data, cuts) {
  # Fit a biglm model on the first chunk of `data` and update it with each
  # remaining chunk.
  #   model.formula: formula passed to biglm().
  #   data:          table holding all rows, indexable as data[rows, ].
  #   cuts:          increasing vector with the last row index of each chunk.
  # Returns the biglm object after all chunks have been added.
  starts <- c(1, cuts[-length(cuts)] + 1)
  fit <- biglm(model.formula, data = data[starts[1]:cuts[1], ])
  # seq_along(cuts)[-1] is empty when there is only one chunk.
  for (i in seq_along(cuts)[-1]) {
    fit <- update(fit, data[seq(starts[i], cuts[i]), ])
  }
  fit
}

# Basic model.
cat("Basic model\n");flush.console()
base <- fitInChunks(I(100*voted) ~ -1+ party4, dt.stacked, chunk.cuts)
# Diff in diffs.
# V1.
cat("Diff in diff v1 model\n");flush.console()
did <- fitInChunks(I(100*voted) ~ -1+ top.two + party4, dt.stacked, chunk.cuts)
# V2.
cat("Diff in diff v2 model\n");flush.console()
did2 <- fitInChunks(I(100*voted) ~ -1+ party4 + party4:top.two, dt.stacked, chunk.cuts)

# Create a list of biglm coefs and ses.
returnCoefList <- function(bl) {
  # Return a list of coefficients, ses, and p-vals from a biglm object.
  #   bl: a fitted model whose summary() has a `mat` matrix with columns
  #       "Coef", "SE" and "p" and one named row per coefficient.
  # Returns a list with one element per coefficient (named by the row name),
  # each a named vector c(Coef, SE, p). Model-level statistics are attached as
  # the attributes "n", "df.resid", "nullrss" and "rsq".
  summ <- summary(bl)
  coef.mat <- summ$mat
  out <- list()
  # seq_len() so that an empty coefficient matrix yields an empty list rather
  # than a subscript error from iterating over c(1, 0).
  for (i in seq_len(nrow(coef.mat))) {
    out[[rownames(coef.mat)[i]]] <- coef.mat[i,c("Coef","SE","p")]
  }
  # Add stats.
  attr(out,"n") <- summ$obj$n
  attr(out,"df.resid") <- summ$obj$df.resid
  attr(out,"nullrss") <- summ$nullrss
  attr(out,"rsq") <- summ$rsq
  out
}
# Coefficient lists for the three fitted models, in table column order.
all.coefs <- lapply(
  list("base" = base, "did" = did, "did2" = did2),
  returnCoefList
)
# Column headers, one per entry of all.coefs.
model.names <- c("(Base)", "(DID)", "(DID)")
# Variables with their printed labels; the order here is the row order of the table.
var.labs <- list(
  'top.two' = 'Top two primary',
  'party4REP' = 'Party REP',
  'party4DEM' = 'Party DEM',
  'party4NPP' = 'Party NPP',
  'party4OTH' = 'Party OTH',
  'party4REP:top.two' = 'Top two*Party REP',
  'party4DEM:top.two' = 'Top two*Party DEM',
  'party4NPP:top.two' = 'Top two*Party NPP',
  'party4OTH:top.two' = 'Top two*Party OTH'
)
makeRows <- function(nm,all.coefs,var.labs,se.star=TRUE,digits=2) {
  # Make a coefficient row and a standard error row for variable nm.
  #   nm:        name of the variable (a key of var.labs and of the model lists).
  #   all.coefs: list of models; each model is a list keyed by variable name
  #              whose elements hold (coefficient, standard error, p-value).
  #   var.labs:  list mapping variable names to printed labels.
  #   se.star:   if TRUE, append " ^*" to coefficients with p < .05.
  #   digits:    number of decimals printed for coefficients and ses.
  # Returns a character vector of length two: the coefficient row and the
  # standard error row, cells joined by " & " and terminated by "\\".
  coef.fmt <- sprintf("%%1.%sf%%s",digits) # e.g. "%1.2f%s"
  se.fmt <- sprintf("(%%1.%sf)",digits)    # e.g. "(%1.2f)"
  # Models that do not contain nm keep a blank cell.
  blank <- "      "
  coefs <- rep(blank, length(all.coefs))
  ses <- rep(blank, length(all.coefs))
  for (i in seq_along(all.coefs)) {
    if (nm %in% names(all.coefs[[i]])) { # this model had this variable
      est <- all.coefs[[i]][[nm]]
      p.val <- est[3]
      # Plain if/else on a scalar: se.star = FALSE simply yields no star, and
      # a missing p-value yields no star rather than printing "NA".
      star <- if (isTRUE(se.star) && !is.na(p.val) && p.val < .05) " ^*" else ""
      coefs[i] <- sprintf(coef.fmt, est[1], star)
      ses[i] <- sprintf(se.fmt, est[2])
    }
  }
  # Make the two rows
  c(paste(paste(c(sprintf("%15s",var.labs[[nm]]), # variable name
                  coefs),collapse= " & "),"\\\\"),
    paste(paste(c(sprintf("%15s"," "),
                  ses),collapse= " & "),"\\\\"))
}

# Header.
# Header (column spec and model names), followed by the coefficient/se rows
# for every labelled variable and a blank spacer row.
fred <- c(
  paste(c("\\begin{tabular}{ l", rep("D{.}{.}{3}", length(all.coefs)), "}"), collapse = " "),
  "\\hline",
  paste(
    paste(c("", sprintf("\\multicolumn{ 1 }{ c }{ %s }", model.names)), collapse = " & "),
    "\\\\"
  ),
  "\\hline",
  unlist(lapply(names(var.labs), function(nm) makeRows(nm = nm, all.coefs, var.labs))),
  "\\\\"
)
# Stats: number of observations and R-squared, one cell per model.
the.n <- c(
  "$N$            ",
  unlist(lapply(all.coefs, function(m) prettyNum(attr(m, "n"), ",")), use.names = FALSE)
)
the.rs <- c(
  "$R^2$          ",
  unlist(lapply(all.coefs, function(m) round(attr(m, "rsq"), 3)), use.names = FALSE)
)
fred <- c(
  fred,
  paste(paste(the.n, collapse = " & "), "\\\\"),
  paste(paste(the.rs, collapse = " & "), "\\\\"),
  "\\hline"
)
# Footer: notes spanning all columns, then close the tabular environment.
fred <- c(
  fred,
  sprintf(
    "\\multicolumn{%s}{l}{\\footnotesize{Standard errors in parentheses}}\\\\",
    length(all.coefs) + 1
  ),
  sprintf(
    "\\multicolumn{%s}{l}{\\footnotesize{$^*$ indicates significance at $p< 0.05 $}}",
    length(all.coefs) + 1
  ),
  "\\end{tabular}"
)
# Echo the table to the console, then save it.
cat(paste(fred, collapse = "\n"))

write(fred, file = "tablesAndFigs/TableS1.tex")

#
# Figure 1: Slopegraph plot.
#
# Turnout rate and number of non-missing cases by primary format and party.
by.pty <- dt.stacked[
  ,
  .(rate = mean(voted, na.rm = TRUE), count = sum(!is.na(voted))),
  by = .(top.two, party4)
]
# Same summary by primary format and age decade.
by.dec <- dt.stacked[
  ,
  .(rate = mean(voted, na.rm = TRUE), count = sum(!is.na(voted))),
  by = .(top.two, age.dec.fac)
]
# Create plot.
.simpleCap <- function(x) {
  # Lower-case a single string, then capitalise the first letter of every
  # space-separated word and glue the words back together with single spaces.
  words <- strsplit(tolower(as.character(x)), " ")[[1]]
  first.letters <- toupper(substring(words, 1, 1))
  remainders <- substring(words, 2)
  paste0(first.letters, remainders, collapse = " ")
}
# Left label at x=[1,2]. Left value at x=2.5. Lines at x=[2.5,5.5]. Right value at x=5.5.
# Draw the slopegraph into a PDF: one gray line per party and per age group,
# connecting its turnout rate at top.two == 0 (left) to its rate at
# top.two == 1 (right). Rates are plotted on the y axis at their raw value and
# printed as percentages with one decimal.
pdf("tablesAndFigs/Figure1.pdf",width=7,height=9)
# Remove all margins; the previous par settings are kept for restoring below.
par.old <- par(mar=c(0,0,0,0))
# Empty canvas (no points, annotations or axes). The y range is taken from
# the rates scaled by 1.05.
plot(x=c(1,6),y=range(1.05*c(by.pty[,rate],by.dec[,rate])),type='n',ann=F,axes=F)
# Left labels.
# pos=2 places the text to the left of the given x coordinate.
by.pty[top.two == 0,text(x=2,y=rate,label=sprintf("Party %s",sapply(party4,.simpleCap)),pos=2)]
by.dec[top.two == 0,text(x=2,y=rate,label=sprintf("Age %s",age.dec.fac),pos=2)]
# Left values.
by.pty[top.two == 0,text(x=2.5,y=rate,label=sprintf("%1.1f",100*rate),pos=2)]
by.dec[top.two == 0,text(x=2.5,y=rate,label=sprintf("%1.1f",100*rate),pos=2)]
# Lines
# Pair each group's two rates: after merge(), rate.x is the top.two == 0 rate
# and rate.y is the top.two == 1 rate. Party and age groups are then stacked
# into one table so all segments are drawn in a single call.
merged <- rbindlist(list(merge(by.pty[top.two == 0,c("party4","rate"),with=F],by.pty[top.two == 1,c("party4","rate"),with=F],by="party4"),
merge(by.dec[top.two == 0,c("age.dec.fac","rate"),with=F],by.dec[top.two == 1,c("age.dec.fac","rate"),with=F],by="age.dec.fac")))
merged[,segments(x0=2.5,x1=5.5,y0=rate.x,y1=rate.y,lwd=2,col='gray')]
# Right values.
# pos=4 places the text to the right of the given x coordinate.
by.pty[top.two == 1,text(x=5.5,y=rate,label=sprintf("%1.1f",100*rate),pos=4)]
by.dec[top.two == 1,text(x=5.5,y=rate,label=sprintf("%1.1f",100*rate),pos=4)]
# Column labels.
# Anchored at the top edge of the plot region (par('usr')[4]) and drawn
# below it (pos=1).
text(x=2.5,y=par('usr')[4],pos=1,labels="Turnout under\nold rules")
text(x=5.5,y=par('usr')[4],pos=1,labels="Turnout under\ntop two")
# Restore the previous graphics parameters and close the PDF device.
par(par.old)
dev.off()
